use "Data\Compustat_20231027\1950_1979.dta",clear //Import raw data

keep gvkey datadate fyear indfmt sale emp xrd sic sich naics naicsh

append using "Data\Compustat_20231027\1980_1989.dta" "Data\Compustat_20231027\1990_1999.dta" "Data\Compustat_20231027\2000_2009.dta" "Data\Compustat_20231027\2010_2019.dta" "Data\Compustat_20231027\2020_2023.dta", keep(gvkey datadate fyear indfmt sale emp xrd sic sich naics naicsh)

gen datayear=year(datadate)
gen datamonth=month(datadate)
gen year=datayear if datamonth>=6 & datamonth<=12
replace year=datayear-1 if datamonth>=1 & datamonth<=5
keep if indfmt=="INDL"
bys gvkey year: egen date_max=max(datadate)
keep if datadate==date_max /*28 obs deleted*/
duplicates report gvkey year

preserve
keep if year>=1972 & year<=2006
gen xrd_indicator=0 
replace xrd_indicator=1 if xrd>0 & xrd!=.
collapse (max) xrd_indicator, by( gvkey )
destring gvkey,replace
merge m:1 gvkey using "Data\Patent_Data\pdpcohdr.dta"
drop if _m==2
drop _m
egen rd_indicator=rowmax(xrd_indicator match)
keep gvkey xrd_indicator rd_indicator
save "Data\rd_indicator.dta",replace
restore 

replace sale=0 if sale<0

gen rd_ratio=xrd/sale
replace rd_ratio=0 if xrd==0
replace rd_ratio=0 if xrd==.

egen rd_ratio_99=pctile(rd_ratio), p(99)
replace rd_ratio=. if rd_ratio>rd_ratio_99

gen large=1 if emp>=1 & emp~=.
replace large=0 if emp<1

gen size=1 if emp<1
replace size=2 if emp>=1 & emp<10
replace size=3 if emp>=10 & emp~=.

egen sale_50=pctile(sale), p(50)
gen high_sale=1 if sale>sale_50 & sale~=.
replace high_sale=0 if sale<sale_50 

destring gvkey, replace
merge m:1 gvkey using "Data\rd_indicator.dta"
drop if _m==2
drop _m

save "Data\RD_Sale.dta",replace